Binary DVs: Logit/Probit and the LPM

November 21, 2024

Packages

library(haven)
library(tidyverse)
library(jtools)
library(marginaleffects)
library(skimr)
library(estimatr)

nes <- read_dta("nes.dta")

Binary Dependent Variable: Vote Choice

# Summarize key variables in vote choice model

# Dependent variable: 1=Clinton, 0=Trump
nes |> drop_na(Clinton_vote) |> group_by(Clinton_vote) |> summarize(n=n()) |> mutate(pct=100*n/sum(n))
# A tibble: 2 × 3
  Clinton_vote     n   pct
  <dbl+lbl>    <int> <dbl>
1 0 [Trump]     1245  47.7
2 1 [Clinton]   1364  52.3

Inependent Variables

# Independent variables
nes |> group_by(better_worse_past_econ) |> summarize(n=n()) |> mutate(pct=100*n/sum(n))
# A tibble: 6 × 3
  better_worse_past_econ     n    pct
  <dbl+lbl>              <int>  <dbl>
1  1 [Much better]         249  5.83 
2  2 [Somwht better]       951 22.3  
3  3 [Abt same]           1806 42.3  
4  4 [Somwht worse]        670 15.7  
5  5 [Much worse]          576 13.5  
6 NA                        19  0.445
nes |> group_by(partyid7) |> summarize(n=n()) |> mutate(pct=100*n/sum(n))
# A tibble: 8 × 3
  partyid7          n    pct
  <dbl+lbl>     <int>  <dbl>
1  1 [StrngDem]   890 20.8  
2  2 [WkDem]      560 13.1  
3  3 [IndDem]     490 11.5  
4  4 [Indep]      579 13.6  
5  5 [IndRep]     500 11.7  
6  6 [WkRep]      508 11.9  
7  7 [StrngRep]   721 16.9  
8 NA               23  0.539

Inependent Variables

# Independent variables
nes |> group_by(Relig_attend) |> summarize(n=n()) |> mutate(pct=100*n/sum(n))
# A tibble: 7 × 3
  Relig_attend         n    pct
  <dbl+lbl>        <int>  <dbl>
1  0 [Never]        1749 41.0  
2  1 [Few tmes/yr]   704 16.5  
3  2 [1-2/ mnth]     471 11.0  
4  3 [Almst ev wk]   536 12.5  
5  4 [Every wk]      426  9.97 
6  5 [>Every wk]     362  8.48 
7 NA                  23  0.539
nes |> group_by(Race3) |> summarize(n=n()) |> mutate(pct=100*n/sum(n))
# A tibble: 4 × 3
  Race3             n   pct
  <dbl+lbl>     <int> <dbl>
1  1 [White]     3038 71.1 
2  2 [Black]      398  9.32
3  3 [Hispanic]   450 10.5 
4 NA              385  9.01

Logit Model

# Logit model using "glm" 
vote_l <- glm(Clinton_vote ~ better_worse_past_econ + 
            partyid7 + Relig_attend + as.factor(Race3), 
            family=binomial(link=logit), data=nes)
summary(vote_l, digits=4)

Call:
glm(formula = Clinton_vote ~ better_worse_past_econ + partyid7 + 
    Relig_attend + as.factor(Race3), family = binomial(link = logit), 
    data = nes)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)             7.40859    0.35695  20.756  < 2e-16 ***
better_worse_past_econ -1.08213    0.09016 -12.003  < 2e-16 ***
partyid7               -1.02725    0.04438 -23.146  < 2e-16 ***
Relig_attend           -0.20739    0.04616  -4.493 7.03e-06 ***
as.factor(Race3)2       2.29559    0.40660   5.646 1.64e-08 ***
as.factor(Race3)3       1.36818    0.27240   5.023 5.10e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3340.5  on 2410  degrees of freedom
Residual deviance: 1203.7  on 2405  degrees of freedom
  (1860 observations deleted due to missingness)
AIC: 1215.7

Number of Fisher Scoring iterations: 6

Goodness of Fit: PRE

# Goodness of fit: Proportional reduction in error (PRE)
# First, save "fitted values" (predicted prob(Clinton) for each observation); 
# generate whether observation is "correctly predicted" or not (using .5 threshold).
# Using the "fitted" command requires removing all missing data. 
# Note I create new data object, "nes_fit"
nes_fit <- nes |> 
  drop_na(Clinton_vote, better_worse_past_econ, partyid7, Relig_attend, Race3) |> 
  mutate(prob=fitted(vote_l), 
         correct=ifelse(prob>.5 & Clinton_vote==1, 1, 0), 
         correct=ifelse(prob<.5 & Clinton_vote==0, 1, correct))

Goodness of Fit: PRE

# Second, generate three core measures -- PMC, PCP, and PRE -- from the "nes_fit" object
# NOTE: if "1" is the modal category of DV (which it is in this case), use code below for "PMC": PMC=mean(Clinton_vote)
# If "0" is the modal category of DV, use: PMC=1-mean(Clinton_vote)
nes_fit |> summarize(PMC=mean(Clinton_vote), PCP=mean(correct)) |> 
  mutate(PRE = (PCP-PMC) / (1-PMC) )
# A tibble: 1 × 3
    PMC   PCP   PRE
  <dbl> <dbl> <dbl>
1 0.514 0.904 0.803

Pred. Probs.: Avg. Case Approach

  • Ecomomic perceptions
## ECONOMIC PERCEPTIONS

## AVG CASE APPROACH 
econ_ac_l <- predictions(
  vote_l,
  by = "better_worse_past_econ",
  numderiv = "richardson",
  newdata = datagrid(better_worse_past_econ = 1:5, 
                     grid_type = "counterfactual"))
head(econ_ac_l)

 better_worse_past_econ Estimate Pr(>|z|)     S  2.5 % 97.5 %
                      1    0.915  < 0.001 102.4 0.8783  0.941
                      2    0.785  < 0.001  80.1 0.7401  0.823
                      3    0.552  0.00905   6.8 0.5131  0.591
                      4    0.295  < 0.001  44.0 0.2499  0.344
                      5    0.124  < 0.001  78.7 0.0887  0.171

Columns: better_worse_past_econ, estimate, p.value, s.value, conf.low, conf.high 
Type:  invlink(link) 

Pred. Probs.: Obs. Value Approach

  • Ecomomic perceptions
## OBSERVED VALUE APPROACH 
econ_ov_l <- predictions(
  vote_l,
  type = "response",
  by = "better_worse_past_econ",
  newdata = datagrid(better_worse_past_econ = 1:5, 
                     grid_type = "counterfactual"))
head(econ_ov_l)

 better_worse_past_econ Estimate Std. Error    z Pr(>|z|)     S 2.5 % 97.5 %
                      1    0.726     0.0203 35.7   <0.001 924.1 0.686  0.765
                      2    0.621     0.0118 52.4   <0.001   Inf 0.598  0.644
                      3    0.520     0.0069 75.5   <0.001   Inf 0.507  0.534
                      4    0.419     0.0114 36.8   <0.001 983.4 0.397  0.441
                      5    0.311     0.0201 15.5   <0.001 178.1 0.272  0.351

Columns: better_worse_past_econ, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high 
Type:  response 

Pred. Probs.: Avg. Case Approach

  • Race
## RACE

## AVG CASE
race_ac_l <- predictions(
  vote_l, 
  by = "Race3",
  numderiv = "richardson",
  newdata = datagrid(Race3 = 1:3, 
                     grid_type = "counterfactual"))
head(race_ac_l)

 Race3 Estimate Pr(>|z|)    S 2.5 % 97.5 %
     1    0.451   0.0163  5.9 0.412  0.491
     2    0.891   <0.001 22.9 0.789  0.947
     3    0.764   <0.001 17.3 0.660  0.843

Columns: Race3, estimate, p.value, s.value, conf.low, conf.high 
Type:  invlink(link) 

Pred. Probs.: Obs. Value Approach

  • Race
## OBSERVED VALUE
race_ov_l <- predictions(
  vote_l, 
  type = "response",
  by = "Race3",
  numderiv = "richardson",
  newdata = datagrid(Race3 = 1:3, 
                     grid_type = "counterfactual"))
head(race_ov_l)

 Race3 Estimate Std. Error    z Pr(>|z|)     S 2.5 % 97.5 %
     1    0.493    0.00654 75.4   <0.001   Inf 0.480  0.506
     2    0.683    0.03475 19.7   <0.001 283.3 0.615  0.751
     3    0.604    0.02151 28.1   <0.001 574.5 0.562  0.646

Columns: Race3, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high 
Type:  response 

Compare Avg. Case to Obs. Value, Logit

# Compare AC to OV: Econ Perceptions

ggplot(econ_ac_l, aes(x=better_worse_past_econ, y=estimate)) +
  geom_line(color="black") +
  geom_line(data=econ_ov_l, aes(x=better_worse_past_econ, y=estimate), color="dodgerblue") +
  labs(x="Economic perceptions", y="Predicted Prob(Clinton Vote)") +
  geom_text(x=2, y=.86, label="Avg. case", color="black") +
  geom_text(x=1.5, y=.6, label="Obs. value", color="dodgerblue") +
  scale_x_continuous(breaks=c(1:5), labels=c("Better", "2", "3", "4", "Worse")) +
  scale_y_continuous(limits = c(.1, 1), breaks = seq(.1, 1, by = .10))

Compare Avg. Case to Obs. Value, Logit

# Compare AC to OV: Race

ggplot(race_ac_l, aes(x=as.numeric(Race3), y=estimate)) +
  geom_line(color="black") + 
  geom_line(data=race_ov_l, aes(x=as.numeric(Race3), y=estimate), color="dodgerblue") +
  labs(x="", y="Predicted Prob(Clinton Vote)") +
  geom_text(x=2, y=.93, label="Avg. case", color="black") +
  geom_text(x=1.5, y=.5, label="Obs. value", color="dodgerblue") +
  scale_x_continuous(breaks=c(1:3), labels=c("White", "Black", "Hispanic")) +
  scale_y_continuous(limits = c(.1, 1), breaks = seq(.1, 1, by = .10))

Probit Model

# Probit
vote_p <- glm(Clinton_vote ~ better_worse_past_econ + 
                partyid7 + Relig_attend + as.factor(Race3), 
              family=binomial(link=probit), data=nes)
summary(vote_p, digits=4)

Call:
glm(formula = Clinton_vote ~ better_worse_past_econ + partyid7 + 
    Relig_attend + as.factor(Race3), family = binomial(link = probit), 
    data = nes)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)             4.07811    0.17785  22.930  < 2e-16 ***
better_worse_past_econ -0.58227    0.04767 -12.215  < 2e-16 ***
partyid7               -0.57227    0.02225 -25.716  < 2e-16 ***
Relig_attend           -0.11744    0.02476  -4.743 2.10e-06 ***
as.factor(Race3)2       1.19918    0.20845   5.753 8.77e-09 ***
as.factor(Race3)3       0.74229    0.14804   5.014 5.32e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3340.5  on 2410  degrees of freedom
Residual deviance: 1205.1  on 2405  degrees of freedom
  (1860 observations deleted due to missingness)
AIC: 1217.1

Number of Fisher Scoring iterations: 6

Pred. Probs.: Avg. Case Approach

# ECON PERCEPTIONS 

# AVERAGE CASE APPROACH
econ_ac_p <- predictions(
  vote_p,
  by = "better_worse_past_econ",
  numderiv = "richardson",
  newdata = datagrid(better_worse_past_econ = 1:5, 
                     grid_type = "counterfactual"))
head(econ_ac_p)

 better_worse_past_econ Estimate Pr(>|z|)     S 2.5 % 97.5 %
                      1    0.900  < 0.001 106.7 0.858  0.932
                      2    0.758  < 0.001  83.8 0.715  0.796
                      3    0.546  0.00646   7.3 0.513  0.579
                      4    0.321  < 0.001  44.4 0.279  0.365
                      5    0.147  < 0.001  80.5 0.106  0.198

Columns: better_worse_past_econ, estimate, p.value, s.value, conf.low, conf.high 
Type:  invlink(link) 

Pred. Probs.: Obs. Value Approach

## OBSERVED VALUE APPROACH 
econ_ov_p <- predictions(
  vote_p,
  type = "response",
  by = "better_worse_past_econ",
  newdata = datagrid(better_worse_past_econ = 1:5, 
                     grid_type = "counterfactual"))
head(econ_ov_p)

 better_worse_past_econ Estimate Std. Error    z Pr(>|z|)      S 2.5 % 97.5 %
                      1    0.722    0.01948 37.1   <0.001  997.8 0.684  0.761
                      2    0.621    0.01174 52.9   <0.001    Inf 0.598  0.644
                      3    0.521    0.00692 75.3   <0.001    Inf 0.508  0.535
                      4    0.420    0.01130 37.2   <0.001 1004.0 0.398  0.443
                      5    0.316    0.01933 16.4   <0.001  197.6 0.278  0.354

Columns: better_worse_past_econ, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high 
Type:  response 

Pred. Probs.: Avg. Case Approach

## RACE

## AVG CASE
race_ac_p <- predictions(
  vote_p, 
  by = "Race3",
  numderiv = "richardson",
  newdata = datagrid(Race3 = 1:3, 
                     grid_type = "counterfactual"))
head(race_ac_p)

 Race3 Estimate Pr(>|z|)    S 2.5 % 97.5 %
     1    0.461   0.0243  5.4 0.427  0.495
     2    0.865   <0.001 24.0 0.759  0.933
     3    0.740   <0.001 17.6 0.643  0.821

Columns: Race3, estimate, p.value, s.value, conf.low, conf.high 
Type:  invlink(link) 

Pred. Probs.: Obs. Value Approach

## OBSERVED VALUE
race_ov_p <- predictions(
  vote_p, 
  type = "response",
  by = "Race3",
  numderiv = "richardson",
  newdata = datagrid(Race3 = 1:3, 
                     grid_type = "counterfactual"))
head(race_ov_p)

 Race3 Estimate Std. Error    z Pr(>|z|)     S 2.5 % 97.5 %
     1    0.494    0.00658 75.1   <0.001   Inf 0.481  0.507
     2    0.675    0.03183 21.2   <0.001 329.4 0.613  0.738
     3    0.606    0.02143 28.3   <0.001 581.5 0.564  0.648

Columns: Race3, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high 
Type:  response 

Compare Logit and Probit

# Compare logit v. probit probs (obs. value): Econ
ggplot(econ_ov_l, aes(x=better_worse_past_econ, y=estimate)) +
  geom_line(color="black") +
  geom_line(data=econ_ov_p, aes(x=better_worse_past_econ, y=estimate), color="dodgerblue") +
  labs(x="Economic perceptions", y="Predicted Prob(Clinton Vote)") +
  geom_text(x=2, y=.7, label="Logit", color="black") +
  geom_text(x=1.5, y=.6, label="Probit", color="dodgerblue") +
  scale_x_continuous(breaks=c(1:5), labels=c("Better", "2", "3", "4", "Worse")) +
  scale_y_continuous(limits = c(.1, 1), breaks = seq(.1, 1, by = .10))

Compare Logit and Probit

# Compare logit v. probit probs (obs. value): Race
ggplot(race_ov_l, aes(x=as.numeric(Race3), y=estimate)) +
  geom_line(color="black") + 
  geom_line(data=race_ov_p, aes(x=as.numeric(Race3), y=estimate), color="dodgerblue") +
  labs(x="", y="Predicted Prob(Clinton Vote)") +
  geom_text(x=2, y=.75, label="Logit", color="black") +
  geom_text(x=1.5, y=.5, label="Probit", color="dodgerblue") +
  scale_x_continuous(breaks=c(1:3), labels=c("White", "Black", "Hispanic")) +
  scale_y_continuous(limits = c(.1, 1), breaks = seq(.1, 1, by = .10))

Linear Probability Model (LPM)

# LPM with robust standard errors; use lm_robust from estimatr package
vote_lpm <- lm_robust(Clinton_vote ~ better_worse_past_econ + 
              partyid7 + Relig_attend + as.factor(Race3), data=nes, 
              se_type="stata")
summary(vote_lpm)

Call:
lm_robust(formula = Clinton_vote ~ better_worse_past_econ + partyid7 + 
    Relig_attend + as.factor(Race3), data = nes, se_type = "stata")

Standard error type:  HC1 

Coefficients:
                       Estimate Std. Error t value   Pr(>|t|) CI Lower
(Intercept)             1.32678   0.016381  80.993  0.000e+00  1.29466
better_worse_past_econ -0.08695   0.006912 -12.580  3.420e-35 -0.10051
partyid7               -0.14105   0.003353 -42.064 2.677e-290 -0.14762
Relig_attend           -0.01670   0.003465  -4.818  1.537e-06 -0.02349
as.factor(Race3)2       0.12509   0.017849   7.008  3.122e-12  0.09009
as.factor(Race3)3       0.11881   0.020582   5.773  8.805e-09  0.07845
                        CI Upper   DF
(Intercept)             1.358907 2405
better_worse_past_econ -0.073398 2405
partyid7               -0.134473 2405
Relig_attend           -0.009902 2405
as.factor(Race3)2       0.160087 2405
as.factor(Race3)3       0.159175 2405

Multiple R-squared:  0.6618 ,   Adjusted R-squared:  0.6611 
F-statistic:  2420 on 5 and 2405 DF,  p-value: < 2.2e-16

Pred. Probs. from LPM

# Pred. probs: Econ
econ_lpm <- predictions(
  vote_lpm, 
  by = "better_worse_past_econ",
  df = insight::get_df(vote_lpm),
  numderiv = "richardson",
  newdata = datagrid(better_worse_past_econ = 1:5, 
                     grid_type="counterfactual"))
head(econ_lpm)

 better_worse_past_econ Estimate Std. Error    t Pr(>|t|)     S 2.5 % 97.5 %
                      1    0.691    0.01505 45.9   <0.001   Inf 0.662  0.721
                      2    0.605    0.00911 66.3   <0.001   Inf 0.587  0.622
                      3    0.518    0.00592 87.4   <0.001   Inf 0.506  0.529
                      4    0.431    0.00909 47.4   <0.001   Inf 0.413  0.448
                      5    0.344    0.01503 22.9   <0.001 346.2 0.314  0.373
   Df
 2405
 2405
 2405
 2405
 2405

Columns: better_worse_past_econ, estimate, std.error, statistic, p.value, s.value, conf.low, conf.high, df 
Type:  response 

Compare Logit and LPM

# Compare logit (obs. value) and LPM
ggplot(econ_ov_l, aes(x=better_worse_past_econ, y=estimate)) +
  geom_line(color="black") +
  geom_line(data=econ_lpm, aes(x=better_worse_past_econ, y=estimate), color="dodgerblue") +
  labs(x="Economic perceptions", y="Predicted Prob(Clinton Vote)") +
  geom_text(x=2, y=.7, label="Logit (Obs. Value)", color="black") +
  geom_text(x=1.5, y=.6, label="LPM", color="dodgerblue") +
  scale_x_continuous(breaks=c(1:5), labels=c("Better", "2", "3", "4", "Worse")) +
  scale_y_continuous(limits = c(.1, 1), breaks = seq(.1, 1, by = .10))

Compare Probit and LPM

# Compare probit (OV) and LPM
ggplot(econ_ov_p, aes(x=better_worse_past_econ, y=estimate)) +
  geom_line(color="black") +
  geom_line(data=econ_lpm, aes(x=better_worse_past_econ, y=estimate), color="dodgerblue") +
  labs(x="Economic perceptions", y="Predicted Prob(Clinton Vote)") +
  geom_text(x=2, y=.7, label="Probit (Obs. Value)", color="black") +
  geom_text(x=1.5, y=.6, label="LPM", color="dodgerblue") +
  scale_x_continuous(breaks=c(1:5), labels=c("Better", "2", "3", "4", "Worse")) +
  scale_y_continuous(limits = c(.1, 1), breaks = seq(.1, 1, by = .10))